SVM Classifier WBCD data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [9]:
df = pd.read_csv("wbcd.csv")
In [7]:
df.head()
Out[7]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean points_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst points_worst symmetry_worst dimension_worst
0 87139402 B 12.32 12.39 78.85 464.1 0.10280 0.06981 0.03987 0.03700 ... 13.50 15.64 86.97 549.1 0.1385 0.1266 0.12420 0.09391 0.2827 0.06771
1 8910251 B 10.60 18.95 69.28 346.4 0.09688 0.11470 0.06387 0.02642 ... 11.88 22.94 78.28 424.8 0.1213 0.2515 0.19160 0.07926 0.2940 0.07587
2 905520 B 11.04 16.83 70.92 373.2 0.10770 0.07804 0.03046 0.02480 ... 12.41 26.44 79.93 471.4 0.1369 0.1482 0.10670 0.07431 0.2998 0.07881
3 868871 B 11.28 13.39 73.00 384.8 0.11640 0.11360 0.04635 0.04796 ... 11.92 15.77 76.53 434.0 0.1367 0.1822 0.08669 0.08611 0.2102 0.06784
4 9012568 B 15.19 13.21 97.65 711.8 0.07963 0.06934 0.03393 0.02657 ... 16.20 15.73 104.50 819.1 0.1126 0.1737 0.13620 0.08178 0.2487 0.06766

5 rows × 32 columns

In [12]:
df.corr()
Out[12]:
id radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean points_mean symmetry_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst points_worst symmetry_worst dimension_worst
id 1.000000 0.074626 0.099770 0.073159 0.096893 -0.012968 0.000096 0.050080 0.044158 -0.022114 ... 0.082405 0.064720 0.079986 0.107187 0.010338 -0.002968 0.023203 0.035174 -0.044224 -0.029866
radius_mean 0.074626 1.000000 0.323782 0.997855 0.987357 0.170581 0.506124 0.676764 0.822529 0.147741 ... 0.969539 0.297008 0.965137 0.941082 0.119616 0.413463 0.526911 0.744214 0.163953 0.007066
texture_mean 0.099770 0.323782 1.000000 0.329533 0.321086 -0.023389 0.236702 0.302418 0.293464 0.071401 ... 0.352573 0.912045 0.358040 0.343546 0.077503 0.277830 0.301025 0.295316 0.105008 0.119205
perimeter_mean 0.073159 0.997855 0.329533 1.000000 0.986507 0.207278 0.556936 0.716136 0.850977 0.183027 ... 0.969476 0.303038 0.970387 0.941550 0.150549 0.455774 0.563879 0.771241 0.189115 0.051019
area_mean 0.096893 0.987357 0.321086 0.986507 1.000000 0.177028 0.498502 0.685983 0.823269 0.151293 ... 0.962746 0.287489 0.959120 0.959213 0.123523 0.390410 0.512606 0.722017 0.143570 0.003738
smoothness_mean -0.012968 0.170581 -0.023389 0.207278 0.177028 1.000000 0.659123 0.521984 0.553695 0.557775 ... 0.213120 0.036072 0.238853 0.206718 0.805324 0.472468 0.434926 0.503053 0.394309 0.499316
compactness_mean 0.000096 0.506124 0.236702 0.556936 0.498502 0.659123 1.000000 0.883121 0.831135 0.602641 ... 0.535315 0.248133 0.590210 0.509604 0.565541 0.865809 0.816275 0.815573 0.510223 0.687382
concavity_mean 0.050080 0.676764 0.302418 0.716136 0.685983 0.521984 0.883121 1.000000 0.921391 0.500667 ... 0.688236 0.299879 0.729565 0.675987 0.448822 0.754968 0.884103 0.861323 0.409464 0.514930
points_mean 0.044158 0.822529 0.293464 0.850977 0.823269 0.553695 0.831135 0.921391 1.000000 0.462497 ... 0.830318 0.292752 0.855923 0.809630 0.452753 0.667454 0.752399 0.910155 0.375744 0.368661
symmetry_mean -0.022114 0.147741 0.071401 0.183027 0.151293 0.557775 0.602641 0.500667 0.462497 1.000000 ... 0.185728 0.090651 0.219169 0.177193 0.426675 0.473200 0.433721 0.430297 0.699826 0.438413
dimension_mean -0.052511 -0.311631 -0.076437 -0.261477 -0.283110 0.584792 0.565369 0.336783 0.166917 0.479921 ... -0.253691 -0.051269 -0.205151 -0.231854 0.504942 0.458798 0.346234 0.175325 0.334019 0.767297
radius_se 0.143048 0.679090 0.275869 0.691765 0.732562 0.301467 0.497473 0.631925 0.698050 0.303379 ... 0.715065 0.194799 0.719684 0.751548 0.141919 0.287103 0.380585 0.531062 0.094543 0.049559
texture_se -0.007526 -0.097317 0.386358 -0.086761 -0.066280 0.068406 0.046205 0.076218 0.021480 0.128053 ... -0.111690 0.409003 -0.102242 -0.083195 -0.073658 -0.092439 -0.068956 -0.119638 -0.128215 -0.045655
perimeter_se 0.137331 0.674172 0.281673 0.693135 0.726628 0.296092 0.548905 0.660391 0.710650 0.313893 ... 0.697201 0.200371 0.721031 0.730713 0.130054 0.341919 0.418899 0.554897 0.109930 0.085433
area_se 0.177742 0.735864 0.259845 0.744983 0.800086 0.246552 0.455653 0.617427 0.690299 0.223970 ... 0.757373 0.196497 0.761213 0.811408 0.125389 0.283257 0.385100 0.538166 0.074126 0.017539
smoothness_se 0.096781 -0.222600 0.006614 -0.202694 -0.166777 0.332375 0.135299 0.098564 0.027653 0.187321 ... -0.230691 -0.074743 -0.217304 -0.182195 0.314457 -0.055558 -0.058298 -0.102007 -0.107342 0.101480
compactness_se 0.033961 0.206000 0.191975 0.250744 0.212583 0.318943 0.738722 0.670279 0.490424 0.421659 ... 0.204607 0.143003 0.260516 0.199371 0.227394 0.678780 0.639147 0.483208 0.277878 0.590973
concavity_se 0.055239 0.194204 0.143293 0.228082 0.207660 0.248396 0.570517 0.691270 0.439167 0.342627 ... 0.186904 0.100241 0.226680 0.188353 0.168481 0.484858 0.662564 0.440472 0.197788 0.439329
points_se 0.078768 0.376169 0.163851 0.407217 0.372320 0.380676 0.642262 0.683260 0.615634 0.393298 ... 0.358127 0.086741 0.394999 0.342271 0.215351 0.452888 0.549592 0.602450 0.143116 0.310655
symmetry_se -0.017306 -0.104321 0.009127 -0.081629 -0.072497 0.200774 0.229977 0.178009 0.095351 0.449137 ... -0.128121 -0.077473 -0.103753 -0.110343 -0.012662 0.060255 0.037119 -0.030413 0.389402 0.078079
dimension_se 0.025725 -0.042641 0.054458 -0.005523 -0.019887 0.283607 0.507318 0.449301 0.257584 0.331786 ... -0.037488 -0.003195 -0.001000 -0.022736 0.170568 0.390159 0.379975 0.215204 0.111094 0.591328
radius_worst 0.082405 0.969539 0.352573 0.969476 0.962746 0.213120 0.535315 0.688236 0.830318 0.185728 ... 1.000000 0.359921 0.993708 0.984015 0.216574 0.475820 0.573975 0.787424 0.243529 0.093492
texture_worst 0.064720 0.297008 0.912045 0.303038 0.287489 0.036072 0.248133 0.299879 0.292752 0.090651 ... 0.359921 1.000000 0.365098 0.345842 0.225429 0.360832 0.368366 0.359755 0.233027 0.219122
perimeter_worst 0.079986 0.965137 0.358040 0.970387 0.959120 0.238853 0.590210 0.729565 0.855923 0.219169 ... 0.993708 0.365098 1.000000 0.977578 0.236775 0.529408 0.618344 0.816322 0.269493 0.138957
area_worst 0.107187 0.941082 0.343546 0.941550 0.959213 0.206718 0.509604 0.675987 0.809630 0.177193 ... 0.984015 0.345842 0.977578 1.000000 0.209145 0.438296 0.543331 0.747419 0.209146 0.079647
smoothness_worst 0.010338 0.119616 0.077503 0.150549 0.123523 0.805324 0.565541 0.448822 0.452753 0.426675 ... 0.216574 0.225429 0.236775 0.209145 1.000000 0.568187 0.518523 0.547691 0.493838 0.617624
compactness_worst -0.002968 0.413463 0.277830 0.455774 0.390410 0.472468 0.865809 0.754968 0.667454 0.473200 ... 0.475820 0.360832 0.529408 0.438296 0.568187 1.000000 0.892261 0.801080 0.614441 0.810455
concavity_worst 0.023203 0.526911 0.301025 0.563879 0.512606 0.434926 0.816275 0.884103 0.752399 0.433721 ... 0.573975 0.368366 0.618344 0.543331 0.518523 0.892261 1.000000 0.855434 0.532520 0.686511
points_worst 0.035174 0.744214 0.295316 0.771241 0.722017 0.503053 0.815573 0.861323 0.910155 0.430297 ... 0.787424 0.359755 0.816322 0.747419 0.547691 0.801080 0.855434 1.000000 0.502528 0.511114
symmetry_worst -0.044224 0.163953 0.105008 0.189115 0.143570 0.394309 0.510223 0.409464 0.375744 0.699826 ... 0.243529 0.233027 0.269493 0.209146 0.493838 0.614441 0.532520 0.502528 1.000000 0.537848
dimension_worst -0.029866 0.007066 0.119205 0.051019 0.003738 0.499316 0.687382 0.514930 0.368661 0.438413 ... 0.093492 0.219122 0.138957 0.079647 0.617624 0.810455 0.686511 0.511114 0.537848 1.000000

31 rows × 31 columns

In [13]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 569 non-null    int64  
 1   diagnosis          569 non-null    object 
 2   radius_mean        569 non-null    float64
 3   texture_mean       569 non-null    float64
 4   perimeter_mean     569 non-null    float64
 5   area_mean          569 non-null    float64
 6   smoothness_mean    569 non-null    float64
 7   compactness_mean   569 non-null    float64
 8   concavity_mean     569 non-null    float64
 9   points_mean        569 non-null    float64
 10  symmetry_mean      569 non-null    float64
 11  dimension_mean     569 non-null    float64
 12  radius_se          569 non-null    float64
 13  texture_se         569 non-null    float64
 14  perimeter_se       569 non-null    float64
 15  area_se            569 non-null    float64
 16  smoothness_se      569 non-null    float64
 17  compactness_se     569 non-null    float64
 18  concavity_se       569 non-null    float64
 19  points_se          569 non-null    float64
 20  symmetry_se        569 non-null    float64
 21  dimension_se       569 non-null    float64
 22  radius_worst       569 non-null    float64
 23  texture_worst      569 non-null    float64
 24  perimeter_worst    569 non-null    float64
 25  area_worst         569 non-null    float64
 26  smoothness_worst   569 non-null    float64
 27  compactness_worst  569 non-null    float64
 28  concavity_worst    569 non-null    float64
 29  points_worst       569 non-null    float64
 30  symmetry_worst     569 non-null    float64
 31  dimension_worst    569 non-null    float64
dtypes: float64(30), int64(1), object(1)
memory usage: 142.4+ KB
In [15]:
df.shape
Out[15]:
(569, 32)
In [16]:
df.describe()
Out[16]:
id radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean points_mean symmetry_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst points_worst symmetry_worst dimension_worst
count 5.690000e+02 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 ... 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000
mean 3.037183e+07 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 ... 16.269190 25.677223 107.261213 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946
std 1.250206e+08 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 ... 4.833242 6.146258 33.602542 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061
min 8.670000e+03 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 ... 7.930000 12.020000 50.410000 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040
25% 8.692180e+05 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 ... 13.010000 21.080000 84.110000 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460
50% 9.060240e+05 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 ... 14.970000 25.410000 97.660000 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040
75% 8.813129e+06 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 ... 18.790000 29.720000 125.400000 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080
max 9.113205e+08 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 ... 36.040000 49.540000 251.200000 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500

8 rows × 31 columns

In [17]:
round(df.describe(),2) # round up to 2 zeros
Out[17]:
id radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean points_mean symmetry_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst points_worst symmetry_worst dimension_worst
count 5.690000e+02 569.00 569.00 569.00 569.00 569.00 569.00 569.00 569.00 569.00 ... 569.00 569.00 569.00 569.00 569.00 569.00 569.00 569.00 569.00 569.00
mean 3.037183e+07 14.13 19.29 91.97 654.89 0.10 0.10 0.09 0.05 0.18 ... 16.27 25.68 107.26 880.58 0.13 0.25 0.27 0.11 0.29 0.08
std 1.250206e+08 3.52 4.30 24.30 351.91 0.01 0.05 0.08 0.04 0.03 ... 4.83 6.15 33.60 569.36 0.02 0.16 0.21 0.07 0.06 0.02
min 8.670000e+03 6.98 9.71 43.79 143.50 0.05 0.02 0.00 0.00 0.11 ... 7.93 12.02 50.41 185.20 0.07 0.03 0.00 0.00 0.16 0.06
25% 8.692180e+05 11.70 16.17 75.17 420.30 0.09 0.06 0.03 0.02 0.16 ... 13.01 21.08 84.11 515.30 0.12 0.15 0.11 0.06 0.25 0.07
50% 9.060240e+05 13.37 18.84 86.24 551.10 0.10 0.09 0.06 0.03 0.18 ... 14.97 25.41 97.66 686.50 0.13 0.21 0.23 0.10 0.28 0.08
75% 8.813129e+06 15.78 21.80 104.10 782.70 0.11 0.13 0.13 0.07 0.20 ... 18.79 29.72 125.40 1084.00 0.15 0.34 0.38 0.16 0.32 0.09
max 9.113205e+08 28.11 39.28 188.50 2501.00 0.16 0.35 0.43 0.20 0.30 ... 36.04 49.54 251.20 4254.00 0.22 1.06 1.25 0.29 0.66 0.21

8 rows × 31 columns

In [18]:
sns.pairplot(df,hue='diagnosis',palette='Set1')
Out[18]:
<seaborn.axisgrid.PairGrid at 0x2223ed94708>
In [19]:
df['diagnosis'].value_counts() # value count of target variable size 
Out[19]:
B    357
M    212
Name: diagnosis, dtype: int64
In [20]:
sns.countplot(data=df, x='diagnosis')
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x22260963cc8>

lets Normalize our data Featurs Scalling of our data

In [21]:
from sklearn.preprocessing import StandardScaler
In [22]:
scaler = StandardScaler()
In [23]:
scaler.fit(df.drop('diagnosis',axis=1))
Out[23]:
StandardScaler(copy=True, with_mean=True, with_std=True)
In [24]:
scaled_features = scaler.transform(df.drop('diagnosis',axis=1))
In [29]:
df_feat = pd.DataFrame(scaled_features,columns=df.columns[:-1])
df_feat.head()
Out[29]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean points_mean ... dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst points_worst symmetry_worst
0 0.454465 -0.513297 -1.605595 -0.540376 -0.542624 0.458285 -0.654413 -0.614306 -0.307442 0.538081 ... -0.573451 -1.634499 -0.604391 -0.582718 0.268776 -0.812128 -0.709978 -0.315133 -0.119321 -0.899721
1 -0.171815 -1.001801 -0.079038 -0.934566 -0.877375 0.036986 0.196319 -0.312987 -0.580343 0.402996 ... -0.908925 -0.445740 -0.863230 -0.801227 -0.485202 -0.017590 -0.386625 -0.538203 0.063489 -0.447528
2 -0.235899 -0.876835 -0.572377 -0.867014 -0.801153 0.806996 -0.498443 -0.732448 -0.622129 -0.356400 ... -0.799171 0.124213 -0.814083 -0.719308 0.198638 -0.674722 -0.793935 -0.613574 0.157320 -0.284606
3 -0.236192 -0.808671 -1.372888 -0.781338 -0.768161 1.426135 0.175472 -0.532950 -0.024740 -0.148296 ... -0.900641 -1.613330 -0.915355 -0.785054 0.189871 -0.458434 -0.889933 -0.433901 -1.292212 -0.892517
4 -0.170996 0.301824 -1.414775 0.234000 0.161860 -1.190618 -0.663320 -0.688883 -0.576473 -0.330843 ... -0.014328 -1.619844 -0.082245 -0.108082 -0.866574 -0.512506 -0.652408 -0.499832 -0.669366 -0.902492

5 rows × 31 columns

Train test split

In [30]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
In [31]:
X = scaled_features
y = df['diagnosis']
In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

SVM Classifiers

In [33]:
#import SVC classifier
from sklearn.svm import SVC
#import metrics to compute accuracy
from sklearn.metrics import accuracy_score
In [34]:
#initiacte classifier
svc = SVC()
In [35]:
svc.fit(X_train,y_train)
Out[35]:
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Prediction and Evaluation

In [36]:
predictions = svc.predict(X_test)
In [37]:
from sklearn.metrics import classification_report,confusion_matrix
In [38]:
print(confusion_matrix(y_test,predictions))
[[119   2]
 [  1  66]]
In [39]:
print(classification_report(y_test,predictions))
              precision    recall  f1-score   support

           B       0.99      0.98      0.99       121
           M       0.97      0.99      0.98        67

    accuracy                           0.98       188
   macro avg       0.98      0.98      0.98       188
weighted avg       0.98      0.98      0.98       188

In [40]:
print('Model Accuracy Score: {0:0.4f}'.format(accuracy_score(y_test,predictions))) # Accuracy predictions 
Model Accuracy Score: 0.9840

Create Classification SVM classification linear object

SVM classifiction linear Kernel object

In [42]:
model_linear = SVC(kernel = "linear")
In [43]:
model_linear.fit(X_train,y_train)
Out[43]:
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Prediction and Evaluation

In [44]:
pred_test_linear = model_linear.predict(X_test)
In [45]:
np.mean(pred_test_linear==y_test) 
Out[45]:
0.9840425531914894

Kernel poly

In [46]:
model_poly = SVC(kernel = "poly")
In [47]:
model_poly.fit(X_train,y_train)
Out[47]:
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Prediction and evaluation

In [48]:
pred_test_poly = model_poly.predict(X_test)
In [49]:
np.mean(pred_test_poly==y_test) 
Out[49]:
0.925531914893617

Kernel RBF

In [50]:
model_rbf = SVC(kernel = "rbf")
In [51]:
model_rbf.fit(X_train,y_train)
Out[51]:
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Predicition and Evaluation

In [52]:
pred_test_rbf = model_rbf.predict(X_test)
In [53]:
np.mean(pred_test_rbf==y_test) 
Out[53]:
0.9840425531914894

Kernel Sigmoid

In [54]:
model_sig = SVC(kernel="sigmoid")
In [55]:
model_sig.fit(X_train,y_train)
Out[55]:
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='sigmoid',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Predicition and Evaluation

In [56]:
pred_test_sig = model_sig.predict(X_test)
In [57]:
np.mean(pred_test_sig==y_test)
Out[57]:
0.9787234042553191
In [ ]: